@inproceedings{jung-etal-2022-modal,
title = "Modal-specific Pseudo Query Generation for Video Corpus Moment Retrieval",
author = "Jung, Minjoon and
Choi, SeongHo and
Kim, JooChan and
Kim, Jin-Hwa and
Zhang, Byoung-Tak",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.530",
doi = "10.18653/v1/2022.emnlp-main.530",
pages = "7769--7781",
abstract = "Video corpus moment retrieval (VCMR) is the task to retrieve the most relevant video moment from a large video corpus using a natural language query.For narrative videos, e.g., drama or movies, the holistic understanding of temporal dynamics and multimodal reasoning are crucial.Previous works have shown promising results; however, they relied on the expensive query annotations for the VCMR, i.e., the corresponding moment intervals.To overcome this problem, we propose a self-supervised learning framework: Modal-specific Pseudo Query Generation Network (MPGN).First, MPGN selects candidate temporal moments via subtitle-based moment sampling.Then, it generates pseudo queries exploiting both visualand textual information from the selected temporal moments.Through the multimodal information in the pseudo queries, we show that MPGN successfully learns to localize the video corpus moment without any explicit annotation.We validate the effectiveness of MPGN on TVR dataset, showing the competitive results compared with both supervised models and unsupervised setting models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jung-etal-2022-modal">
<titleInfo>
<title>Modal-specific Pseudo Query Generation for Video Corpus Moment Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Minjoon</namePart>
<namePart type="family">Jung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SeongHo</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">JooChan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jin-Hwa</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Byoung-Tak</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Video corpus moment retrieval (VCMR) is the task of retrieving the most relevant video moment from a large video corpus using a natural language query. For narrative videos, e.g., dramas or movies, a holistic understanding of temporal dynamics and multimodal reasoning are crucial. Previous works have shown promising results; however, they relied on expensive query annotations for VCMR, i.e., the corresponding moment intervals. To overcome this problem, we propose a self-supervised learning framework: the Modal-specific Pseudo Query Generation Network (MPGN). First, MPGN selects candidate temporal moments via subtitle-based moment sampling. Then, it generates pseudo queries exploiting both visual and textual information from the selected temporal moments. Through the multimodal information in the pseudo queries, we show that MPGN successfully learns to localize video corpus moments without any explicit annotation. We validate the effectiveness of MPGN on the TVR dataset, showing competitive results compared with both supervised and unsupervised models.</abstract>
<identifier type="citekey">jung-etal-2022-modal</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.530</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.530</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>7769</start>
<end>7781</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Modal-specific Pseudo Query Generation for Video Corpus Moment Retrieval
%A Jung, Minjoon
%A Choi, SeongHo
%A Kim, JooChan
%A Kim, Jin-Hwa
%A Zhang, Byoung-Tak
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F jung-etal-2022-modal
%X Video corpus moment retrieval (VCMR) is the task of retrieving the most relevant video moment from a large video corpus using a natural language query. For narrative videos, e.g., dramas or movies, a holistic understanding of temporal dynamics and multimodal reasoning are crucial. Previous works have shown promising results; however, they relied on expensive query annotations for VCMR, i.e., the corresponding moment intervals. To overcome this problem, we propose a self-supervised learning framework: the Modal-specific Pseudo Query Generation Network (MPGN). First, MPGN selects candidate temporal moments via subtitle-based moment sampling. Then, it generates pseudo queries exploiting both visual and textual information from the selected temporal moments. Through the multimodal information in the pseudo queries, we show that MPGN successfully learns to localize video corpus moments without any explicit annotation. We validate the effectiveness of MPGN on the TVR dataset, showing competitive results compared with both supervised and unsupervised models.
%R 10.18653/v1/2022.emnlp-main.530
%U https://aclanthology.org/2022.emnlp-main.530
%U https://doi.org/10.18653/v1/2022.emnlp-main.530
%P 7769-7781
Markdown (Informal)
[Modal-specific Pseudo Query Generation for Video Corpus Moment Retrieval](https://aclanthology.org/2022.emnlp-main.530) (Jung et al., EMNLP 2022)
ACL
Minjoon Jung, SeongHo Choi, JooChan Kim, Jin-Hwa Kim, and Byoung-Tak Zhang. 2022. [Modal-specific Pseudo Query Generation for Video Corpus Moment Retrieval](https://aclanthology.org/2022.emnlp-main.530). In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 7769–7781, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.